from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import  OpenAIEmbeddings
from langchain_chroma import Chroma

# Load the source PDF into a list of documents.
loader = PyPDFLoader("../pdf/Java面试准备指南.pdf")
documents = loader.load()

# Configure the splitter: chunk_size=1000 with chunk_overlap=200 shared
# between neighbouring chunks. add_start_index=True is passed so the
# splitter can record where each chunk starts in its source document
# (see the splitter documentation for the exact metadata key).
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)

# Split the loaded documents into chunks.
all_splits = text_splitter.split_documents(documents)

# Build the vector database from the chunks, embedding them with the
# "text-embedding-3-large" OpenAI model. No persistence options are passed
# here, so the store is created with the library's defaults.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_db = Chroma.from_documents(documents=all_splits, embedding=embeddings)

# Query the vector database and print the matching documents.
result = vector_db.similarity_search(query="spring")
print(result)